## Log the wall-clock start time so the run duration can be read off the output.
print(Sys.time())

## Keep strings as character vectors when data frames are built or read.
## Spelled FALSE rather than F: F is an ordinary variable that can be
## reassigned, whereas FALSE is a reserved constant.
options(stringsAsFactors = FALSE)

suppressPackageStartupMessages(library(devtools))
suppressPackageStartupMessages(library(stm))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(tm))
suppressPackageStartupMessages(library(Matrix))
suppressPackageStartupMessages(library(glmnet))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(text2vec))
suppressPackageStartupMessages(library(randomForest))

## install_github("wilryh/parrot", dependencies=TRUE)
suppressPackageStartupMessages(library(parrot))

## Document-level scores. Loading this file is expected to define
## `document_scores_election16`, which is aliased as `docs` for the join below.
load("data/en_tweets_2009-05-09_to_2016-11-08_dimensions_election16_remove_news_all_docs_only_replicate_psrm.RData")
docs <- document_scores_election16

## Hand-coded labels; every column is read as character so that tweet ids are
## never parsed as numbers.
labeled_tweets <- read.csv(
    file = "data/fulloutput_900x3_450LR.csv",
    colClasses = "character"
)

## Id lookup table. Loading this file is expected to define `just_ids_900`
## with columns `tweet_id_NEW` and `tweet_id_ORIG`.
## NOTE(review): if `tweet_id_NEW` is stored as a double, as.character() may
## not reproduce long ids exactly -- confirm it is already character/integer64.
load(file = "data/just_ids_900.Rda")
just_ids_900[["tweet_id"]] <- as.character(just_ids_900[["tweet_id_NEW"]])

## Match each labeled row to its lookup entry via the (new) tweet id, then
## overwrite `tweet_id` with the original id carried by the lookup table.
labeled_tweets <- labeled_tweets %>%
    left_join(just_ids_900, by = "tweet_id")
labeled_tweets[["tweet_id"]] <- labeled_tweets[["tweet_id_ORIG"]]

## For one category label, compute per tweet_id the share of rows in
## `labeled_tweets` whose `category` equals that label (presumably one row per
## coder -- verify against the CSV). Returns a two-column data frame:
## `tweet_id` and a column named after the label.
share_of_category <- function(label) {
    shares <- aggregate(
        category == label ~ tweet_id,
        data = labeled_tweets,
        FUN = mean
    )
    names(shares)[2] <- label
    shares
}

entertainment <- share_of_category("entertainment")
politics <- share_of_category("politics")
social_justice <- share_of_category("social_justice")

## One row per tweet_id with all three shares side by side.
comb <- reduce(
    list(entertainment, politics, social_justice),
    full_join,
    by = "tweet_id"
)



## Attach the hand-coded category shares to every scored document; documents
## without labels get NA in the three share columns.
ordered_docs <- left_join(docs, comb %>% rename(tweetid = tweet_id), by = "tweetid")

## cv.glmnet() assigns cross-validation folds at random. Without a seed the
## selected lambda.min, and therefore the predictions saved later, differ from
## run to run. The seed value matches the one used for the GloVe section.
set.seed(987654321)

## Predictors are the first ten dimension scores, X0..X9.
score_cols <- paste0("X", 0:9)

## Prediction matrix over all documents. It is identical for the three models
## (only *_pred columns are added in between), so it is built once.
all_scores <- as.matrix(ordered_docs[, score_cols])

## Each model: binomial lasso (glmnet default penalty) fit on the labeled rows,
## with the outcome "share of rows assigning this category >= 0.5", then
## predicted probabilities at lambda.min for every document.
## NOTE(review): predict() on a cv.glmnet fit returns a one-column matrix, so
## the *_pred columns are matrix-valued; left as-is for downstream consumers.

has_label <- !is.na(ordered_docs$social_justice)
mod_social_justice <- cv.glmnet(
    x = as.matrix(ordered_docs[has_label, score_cols]),
    y = ordered_docs$social_justice[has_label] >= 0.5,
    family = "binomial"
)
summary(mod_social_justice$cvm)
ordered_docs$social_justice_pred <- predict(
    mod_social_justice,
    newx = all_scores,
    s = "lambda.min", type = "response"
)

has_label <- !is.na(ordered_docs$politics)
mod_politics <- cv.glmnet(
    x = as.matrix(ordered_docs[has_label, score_cols]),
    y = ordered_docs$politics[has_label] >= 0.5,
    family = "binomial"
)
summary(mod_politics$cvm)
ordered_docs$politics_pred <- predict(
    mod_politics,
    newx = all_scores,
    s = "lambda.min", type = "response"
)

has_label <- !is.na(ordered_docs$entertainment)
mod_entertainment <- cv.glmnet(
    x = as.matrix(ordered_docs[has_label, score_cols]),
    y = ordered_docs$entertainment[has_label] >= 0.5,
    family = "binomial"
)
summary(mod_entertainment$cvm)
ordered_docs$entertainment_pred <- predict(
    mod_entertainment,
    newx = all_scores,
    s = "lambda.min", type = "response"
)


## Drop the dimension-score columns (any of X0..X99 that are present) so the
## saved object holds only metadata, label shares and predictions.
## A logical mask is used instead of -which(): when no column matches,
## which() returns integer(0) and negative indexing with it selects ZERO
## columns rather than all of them. drop = FALSE keeps the result a data frame.
ordered_docs_short <- ordered_docs[
    ,
    !(names(ordered_docs) %in% paste0("X", 0:99)),
    drop = FALSE
]

save(
    ordered_docs_short,
    file = "data/code_test_labels_2015_replicate_psrm.RData"
)

#### GLOVE
## Alternative input kept for reference:
## load(file="~/workspace/twitter_russia/replication_data/russia_tweets_out_docs_vocab_election16_replicate_low_thresh0.RData")
## The two files below are expected to define `out_election16` (documents and
## vocab) and `meta_election16` (one metadata row per document).
load("data/russia_tweets_out_election16_replicate_psrm_low_thresh0.RData")
load("data/labeled_troll_accounts_meta_replicate_psrm.RData")

## Documents to keep: those whose tweetid is flagged `not_holdout` in
## `document_scores_election16`. The same mask selects both the token lists
## and the metadata rows, so it is computed once.
keep_doc <- meta_election16$tweetid %in%
    subset(document_scores_election16, not_holdout)$tweetid

## Turn each kept document into a character vector of words: the first row of
## a document matrix is used as an index into the vocabulary.
tokens <- lapply(
    out_election16$documents[keep_doc],
    function(doc) out_election16$vocab[doc[1, ]]
)

it <- itoken(tokens, progressbar = FALSE)

## term_count_min = 1L keeps every term that occurs at least once.
vocab <- prune_vocabulary(create_vocabulary(it), term_count_min = 1L)

vectorizer <- vocab_vectorizer(vocab)
tcm <- create_tcm(it, vectorizer)   # term co-occurrence matrix for GloVe
dtm <- create_dtm(it, vectorizer)   # document-term matrix for averaging

## Seed before fitting so the embedding (and everything downstream that draws
## random numbers) starts from a fixed RNG state.
set.seed(987654321)
glove <- GlobalVectors$new(
    word_vectors_size = 100,
    vocabulary = vocab,
    x_max = 5,
    alpha = 0.75
)
glove_vectors <- fit_transform(tcm, glove, n_iter = 50)

## Document score = sum of the word vectors of its terms divided by the
## document's total term count.
glove_scores <- (dtm %*% glove_vectors) / Matrix::rowSums(dtm)

## Bind metadata and the 100 score columns (used below as X1..X100), keep
## rows where `not_english` is FALSE, then attach the hand-coded shares.
glove_meta <- subset(
    data.frame(
        meta_election16[keep_doc, ],
        as.matrix(glove_scores)
    ),
    !not_english
)

glove_meta <- left_join(
    glove_meta,
    comb %>% rename(tweetid = tweet_id),
    by = "tweetid"
)


## Binomial lasso classifiers on the 100 GloVe document features, one per
## category. Each is fit on the rows that carry a hand-coded share (outcome:
## share >= 0.5) and then scores every row at lambda.min.
## (type.measure="auc" is an alternative that is left disabled.)
glove_cols <- paste0("X", 1:100)

## Feature matrix for prediction over all rows; the feature columns do not
## change while the *_pred columns are appended, so it is built once.
glove_features <- as.matrix(glove_meta[, glove_cols])

labeled_rows <- !is.na(glove_meta$social_justice)
mod_social_justice <- cv.glmnet(
    x = as.matrix(glove_meta[labeled_rows, glove_cols]),
    y = glove_meta$social_justice[labeled_rows] >= 0.5,
    family = "binomial"
)
summary(mod_social_justice$cvm)
glove_meta$social_justice_pred <- predict(
    mod_social_justice,
    newx = glove_features,
    s = "lambda.min", type = "response"
)

labeled_rows <- !is.na(glove_meta$politics)
mod_politics <- cv.glmnet(
    x = as.matrix(glove_meta[labeled_rows, glove_cols]),
    y = glove_meta$politics[labeled_rows] >= 0.5,
    family = "binomial"
)
summary(mod_politics$cvm)
glove_meta$politics_pred <- predict(
    mod_politics,
    newx = glove_features,
    s = "lambda.min", type = "response"
)

labeled_rows <- !is.na(glove_meta$entertainment)
mod_entertainment <- cv.glmnet(
    x = as.matrix(glove_meta[labeled_rows, glove_cols]),
    y = glove_meta$entertainment[labeled_rows] >= 0.5,
    family = "binomial"
)
summary(mod_entertainment$cvm)
glove_meta$entertainment_pred <- predict(
    mod_entertainment,
    newx = glove_features,
    s = "lambda.min", type = "response"
)


## Single class label per row. Rows start as "" and are overwritten where a
## category share is >= 0.5. Comparisons against NA give NA, which a scalar
## sub-assignment treats as "not selected", so unlabeled rows stay "".
## Later assignments win: politics overrides entertainment, which overrides
## social_justice, if more than one share reaches 0.5.
glove_meta$class <- ""
glove_meta$class[glove_meta$social_justice >= 0.5] <- "social_justice"
glove_meta$class[glove_meta$entertainment >= 0.5] <- "entertainment"
glove_meta$class[glove_meta$politics >= 0.5] <- "politics"
glove_meta$class <- factor(glove_meta$class)

glove_cols <- paste0("X", 1:100)
has_label <- !is.na(glove_meta$social_justice)

## Random forest on the labeled rows. The outcome is the integer code of
## `class` re-wrapped as a factor, so the forest's class names are "1", "2", ...
## NOTE(review): presumably codes are used because the "" level name is
## awkward as a class name -- confirm before switching to the labels directly.
r_mod <- randomForest(
    y = as.factor(as.numeric(glove_meta$class[has_label])),
    x = glove_meta[has_label, glove_cols]
)

rf_pred <- predict(r_mod, newdata = as.matrix(glove_meta[, glove_cols]))

## Translate predicted codes back to class labels BY CODE. Assigning
## levels(class) positionally is only right when every level of `class`
## occurs among the labeled rows; if one is missing, the forest has fewer
## classes and positional relabelling shifts every label after the gap.
levels(rf_pred) <- levels(glove_meta$class)[as.integer(levels(rf_pred))]
glove_meta$rf_pred <- factor(rf_pred, levels = levels(glove_meta$class))

## Drop the GloVe feature columns before saving. A logical mask is used
## instead of -which(), which would select zero columns if nothing matched.
glove_meta_short <- glove_meta[
    ,
    !(names(glove_meta) %in% glove_cols),
    drop = FALSE
]

save(
    glove_meta_short,
    file = "data/code_test_labels_glove_2015_replicate_psrm.RData"
)


## Log the wall-clock end time.
print(Sys.time())
